InΒ [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
InΒ [2]:
df = pd.read_csv('Unemployment in India.csv')
InΒ [3]:
print(df)
Region Date Frequency Estimated Unemployment Rate (%) \
0 Andhra Pradesh 31-05-2019 Monthly 3.65
1 Andhra Pradesh 30-06-2019 Monthly 3.05
2 Andhra Pradesh 31-07-2019 Monthly 3.75
3 Andhra Pradesh 31-08-2019 Monthly 3.32
4 Andhra Pradesh 30-09-2019 Monthly 5.17
.. ... ... ... ...
763 NaN NaN NaN NaN
764 NaN NaN NaN NaN
765 NaN NaN NaN NaN
766 NaN NaN NaN NaN
767 NaN NaN NaN NaN
Estimated Employed Estimated Labour Participation Rate (%) Area
0 11999139.0 43.24 Rural
1 11755881.0 42.05 Rural
2 12086707.0 43.50 Rural
3 12285693.0 43.97 Rural
4 12256762.0 44.68 Rural
.. ... ... ...
763 NaN NaN NaN
764 NaN NaN NaN
765 NaN NaN NaN
766 NaN NaN NaN
767 NaN NaN NaN
[768 rows x 7 columns]
InΒ [4]:
#first five rows and columns in dataset
df.head()
Out[4]:
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
InΒ [5]:
#last five rows and columns in datset
df.tail()
Out[5]:
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 763 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 765 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 766 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 767 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
InΒ [6]:
#total number of rows and columns in dataset
df.shape
Out[6]:
(768, 7)
InΒ [7]:
#checking for missing values
df.isnull().sum()
Out[7]:
Region 28 Date 28 Frequency 28 Estimated Unemployment Rate (%) 28 Estimated Employed 28 Estimated Labour Participation Rate (%) 28 Area 28 dtype: int64
InΒ [8]:
#dropping missing values
df.dropna(inplace=True)
InΒ [9]:
df.isnull().sum()
Out[9]:
Region 0 Date 0 Frequency 0 Estimated Unemployment Rate (%) 0 Estimated Employed 0 Estimated Labour Participation Rate (%) 0 Area 0 dtype: int64
InΒ [10]:
#column's data type
df.dtypes
Out[10]:
Region object Date object Frequency object Estimated Unemployment Rate (%) float64 Estimated Employed float64 Estimated Labour Participation Rate (%) float64 Area object dtype: object
InΒ [11]:
#names of columns in dataset
df.columns
Out[11]:
Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
' Estimated Employed', ' Estimated Labour Participation Rate (%)',
'Area'],
dtype='object')
InΒ [12]:
#removing empty spaces before and after the column names
df.columns=df.columns.str.strip()
df
Out[12]:
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 749 | West Bengal | 29-02-2020 | Monthly | 7.55 | 10871168.0 | 44.09 | Urban |
| 750 | West Bengal | 31-03-2020 | Monthly | 6.67 | 10806105.0 | 43.34 | Urban |
| 751 | West Bengal | 30-04-2020 | Monthly | 15.63 | 9299466.0 | 41.20 | Urban |
| 752 | West Bengal | 31-05-2020 | Monthly | 15.22 | 9240903.0 | 40.67 | Urban |
| 753 | West Bengal | 30-06-2020 | Monthly | 9.86 | 9088931.0 | 37.57 | Urban |
740 rows Γ 7 columns
InΒ [13]:
#checking for duplicate values
print(df.duplicated().sum())
0
InΒ [14]:
#detailed information about dataset
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 740 entries, 0 to 753 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 740 non-null object 1 Date 740 non-null object 2 Frequency 740 non-null object 3 Estimated Unemployment Rate (%) 740 non-null float64 4 Estimated Employed 740 non-null float64 5 Estimated Labour Participation Rate (%) 740 non-null float64 6 Area 740 non-null object dtypes: float64(3), object(4) memory usage: 46.2+ KB
InΒ [15]:
#Descriptive statistics
df.describe()
Out[15]:
| Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | |
|---|---|---|---|
| count | 740.000000 | 7.400000e+02 | 740.000000 |
| mean | 11.787946 | 7.204460e+06 | 42.630122 |
| std | 10.721298 | 8.087988e+06 | 8.111094 |
| min | 0.000000 | 4.942000e+04 | 13.330000 |
| 25% | 4.657500 | 1.190404e+06 | 38.062500 |
| 50% | 8.350000 | 4.744178e+06 | 41.160000 |
| 75% | 15.887500 | 1.127549e+07 | 45.505000 |
| max | 76.740000 | 4.577751e+07 | 72.570000 |
InΒ [16]:
#Adding column to dataset
df['Date'] = pd.to_datetime(df['Date'])
df['MM YYYY'] = df['Date'].dt.strftime('%m %Y')
InΒ [17]:
df
Out[17]:
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | MM YYYY | |
|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 2019-05-31 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural | 05 2019 |
| 1 | Andhra Pradesh | 2019-06-30 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural | 06 2019 |
| 2 | Andhra Pradesh | 2019-07-31 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural | 07 2019 |
| 3 | Andhra Pradesh | 2019-08-31 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural | 08 2019 |
| 4 | Andhra Pradesh | 2019-09-30 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural | 09 2019 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 749 | West Bengal | 2020-02-29 | Monthly | 7.55 | 10871168.0 | 44.09 | Urban | 02 2020 |
| 750 | West Bengal | 2020-03-31 | Monthly | 6.67 | 10806105.0 | 43.34 | Urban | 03 2020 |
| 751 | West Bengal | 2020-04-30 | Monthly | 15.63 | 9299466.0 | 41.20 | Urban | 04 2020 |
| 752 | West Bengal | 2020-05-31 | Monthly | 15.22 | 9240903.0 | 40.67 | Urban | 05 2020 |
| 753 | West Bengal | 2020-06-30 | Monthly | 9.86 | 9088931.0 | 37.57 | Urban | 06 2020 |
740 rows Γ 8 columns
InΒ [18]:
#counting unique values of frequency
df.value_counts('Frequency')
Out[18]:
Frequency Monthly 381 Monthly 359 Name: count, dtype: int64
InΒ [19]:
#replacing ' Monthly' to 'Monthly'
df['Frequency']=df['Frequency'].replace(' Monthly','Monthly')
InΒ [20]:
df.value_counts('Frequency')
Out[20]:
Frequency Monthly 740 Name: count, dtype: int64
InΒ [21]:
#counting unique values in region
df.value_counts('Region')
Out[21]:
Region Andhra Pradesh 28 Karnataka 28 Uttar Pradesh 28 Tripura 28 Telangana 28 Tamil Nadu 28 Rajasthan 28 Punjab 28 Odisha 28 Maharashtra 28 Kerala 28 Madhya Pradesh 28 Jharkhand 28 Himachal Pradesh 28 Haryana 28 Gujarat 28 Delhi 28 Chhattisgarh 28 Bihar 28 West Bengal 28 Meghalaya 27 Uttarakhand 27 Assam 26 Puducherry 26 Goa 24 Jammu & Kashmir 21 Sikkim 17 Chandigarh 12 Name: count, dtype: int64
InΒ [22]:
df['Region'].nunique()
Out[22]:
28
InΒ [23]:
#counting unique values in area
df.value_counts('Area')
Out[23]:
Area Urban 381 Rural 359 Name: count, dtype: int64
InΒ [24]:
area_count = df['Area'].value_counts()
fig = px.pie(area_count,
values=area_count.values,
names=area_count.index,
title='Area Distribution',
hole=0.3)
fig.show()
InΒ [25]:
region_count = df['Region'].value_counts()
fig = px.pie(region_count,
values=region_count.values,
names=region_count.index,
title='Region Distribution',
hole=0.3)
fig.show()
InΒ [26]:
rural = df[df.Area == 'Rural']
urban = df[df.Area == 'Urban']
InΒ [27]:
rural.pivot_table(index = 'MM YYYY', values = 'Estimated Unemployment Rate (%)', aggfunc = np.mean)
Out[27]:
| Estimated Unemployment Rate (%) | |
|---|---|
| MM YYYY | |
| 01 2020 | 7.842692 |
| 02 2020 | 8.752308 |
| 03 2020 | 9.683333 |
| 04 2020 | 21.746000 |
| 05 2019 | 7.068077 |
| 05 2020 | 21.210800 |
| 06 2019 | 8.201154 |
| 06 2020 | 11.825200 |
| 07 2019 | 7.741923 |
| 08 2019 | 8.503077 |
| 09 2019 | 7.036800 |
| 10 2019 | 9.051111 |
| 11 2019 | 8.432222 |
| 12 2019 | 8.233600 |
InΒ [28]:
plt.figure(figsize = (14,7))
sns.barplot(x = 'MM YYYY', y = 'Estimated Unemployment Rate (%)', data = rural, errorbar=('ci',0), palette='pastel')
plt.xlabel('MM YYYY')
plt.ylabel('Unemployment Rate (%)')
plt.title("Rural - Unemployment Rate (%)");
plt.xticks(rotation=45)
plt.show()
InΒ [29]:
plt.figure(figsize = (14,7))
sns.barplot(x = 'MM YYYY', y = 'Estimated Unemployment Rate (%)', data = urban, errorbar=('ci', 0))
plt.xlabel('Month-Year')
plt.ylabel('Unemployment Rate (%)')
plt.title("Urban - Unemployment Rate (%)");
InΒ [30]:
fig = px.sunburst(df, path=['MM YYYY', 'Area'], values='Estimated Unemployment Rate (%)', title='Sunburst Plot of Unemployment Rate Comparison')
fig.show()
InΒ [31]:
from matplotlib.ticker import FuncFormatter
rural = pd.DataFrame(df)
def millions_formatter(x, pos):
return f'{x*1e-6:.1f}M'
plt.figure(figsize=(10, 5))
sns.barplot(x='MM YYYY', y='Estimated Employed', data=rural, errorbar=('ci', 0), palette='Set2')
plt.xlabel('Month-Year')
plt.ylabel('Estimated Employed')
plt.xticks(rotation=45)
plt.title("Rural - Estimated Employed")
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formatter))
plt.show()
InΒ [32]:
from matplotlib.ticker import FuncFormatter
rural = pd.DataFrame(df)
def millions_formatter(x, pos):
return f'{x*1e-6:.1f}M'
plt.figure(figsize=(10, 5))
sns.barplot(x='MM YYYY', y='Estimated Employed', data=urban, errorbar=('ci', 0), palette='Set2')
plt.xlabel('Month-Year')
plt.ylabel('Estimated Employed')
plt.xticks(rotation=45)
plt.title("Urban - Estimated Employed")
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formatter))
plt.show()
InΒ [33]:
fig = px.sunburst(df, path=['MM YYYY', 'Area'], values='Estimated Employed', title='Sunburst Plot of Employment Rate Comparison')
fig.show()
InΒ [34]:
plt.figure(figsize = (12,6))
sns.barplot(x = 'MM YYYY', y = 'Estimated Labour Participation Rate (%)', data = rural, errorbar=('ci', 0), palette='Set2')
plt.xlabel('Month-Year')
plt.ylabel('Labour Participation Rate')
plt.title("Rural - Labour Participation Rate");
InΒ [35]:
plt.figure(figsize = (12,6))
sns.barplot(x = 'MM YYYY', y = 'Estimated Labour Participation Rate (%)', data = urban,errorbar=('ci', 0),palette='Set2')
plt.xlabel('Month-Year')
plt.ylabel('Labour Participation Rate (%)')
plt.title("Urban - Labour Participation Rate");
InΒ [36]:
px.scatter(df,x='MM YYYY',y='Estimated Labour Participation Rate (%)',color='Area')
InΒ [37]:
#Estimated unemployment rate over time
plt.figure(figsize=(12, 6))
sns.lineplot(x='MM YYYY', y='Estimated Unemployment Rate (%)', data=df, marker='o')
plt.title('Estimated Unemployment Rate (%) Over Time')
plt.xlabel('MM YYYY')
plt.ylabel('Estimated Unemployment Rate (%)')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
InΒ [38]:
figure = px.bar(df, x = 'Date', y = 'Estimated Labour Participation Rate (%)', color = 'Date', title = 'Estimated Labour Participation Rate (%)')
figure.show()
InΒ [39]:
#Average, highest and lowest unemployment rate in India
#calculating average unemployment rate by region
average_unemployment_rate = df.groupby('Region')['Estimated Unemployment Rate (%)'].mean()
#State with highest unemployment rate
state_with_highest_unemployment_rate = average_unemployment_rate.idxmax()
highest_unemployment_rate = average_unemployment_rate.max()
#State with lowest unemployment rate
state_with_lowest_unemployment_rate = average_unemployment_rate.idxmin()
lowest_unemployment_rate = average_unemployment_rate.min()
print("State with highest unemployment rate:", state_with_highest_unemployment_rate)
print("Highest unemployment rate:", highest_unemployment_rate)
print("State with lowest unemployment rate:", state_with_lowest_unemployment_rate)
print("Lowest unemployment rate:", lowest_unemployment_rate)
State with highest unemployment rate: Tripura Highest unemployment rate: 28.350357142857142 State with lowest unemployment rate: Meghalaya Lowest unemployment rate: 4.7988888888888885
InΒ [40]:
sns.set_palette("Set1")
plt.figure(figsize=(12, 6))
average_unemployment_rate.sort_values(ascending=False).plot(kind='bar')
plt.title("Average Unemployment Rate by State")
plt.xlabel("Region")
plt.ylabel("Average Unemployment Rate (%)")
plt.xticks(rotation=90)
plt.show()
InΒ [41]:
figure = px.bar(df, x = 'Date', y = 'Estimated Employed', color = 'Date', title = 'Estimated Employed People')
figure.show()
InΒ [42]:
#Average, highest and lowest employment rate in India
#calculating average employment rate by region
average_employment_rate = df.groupby('Region')['Estimated Employed'].mean()
#State with highest unemployment rate
state_with_highest_employment_rate = average_employment_rate.idxmax()
highest_employment_rate = average_employment_rate.max()
#State with lowest unemployment rate
state_with_lowest_employment_rate = average_employment_rate.idxmin()
lowest_employment_rate = average_employment_rate.min()
print("State with highest employment rate:", state_with_highest_employment_rate)
print("Highest employment rate:", highest_employment_rate)
print("State with lowest employment rate:", state_with_lowest_employment_rate)
print("Lowest employment rate:", lowest_employment_rate)
State with highest employment rate: Uttar Pradesh Highest employment rate: 28094832.17857143 State with lowest employment rate: Sikkim Lowest employment rate: 106880.70588235294
InΒ [43]:
from matplotlib.ticker import FuncFormatter
rural = pd.DataFrame(df)
def millions_formatter(x, pos):
return f'{x*1e-6:.1f}M'
sns.set_palette('Set2')
plt.figure(figsize=(10, 5))
average_employment_rate.sort_values(ascending=False).plot(kind='bar')
plt.xlabel('Regionr')
plt.ylabel('Average Employment Rate (%)')
plt.xticks(rotation=90)
plt.title("Average Employment Rate by State")
plt.gca().yaxis.set_major_formatter(FuncFormatter(millions_formatter))
plt.show()
InΒ [44]:
#Average, highest and lowest employment rate in India
#calculating average employment rate by region
average_labour_participation_rate = df.groupby('Region')['Estimated Labour Participation Rate (%)'].mean()
#State with highest unemployment rate
state_with_highest_labour_participation_rate = average_labour_participation_rate.idxmax()
highest_labour_participation_rate = average_labour_participation_rate.max()
#State with lowest unemployment rate
state_with_lowest_labour_participation_rate = average_labour_participation_rate.idxmin()
lowest_labour_participation_rate = average_labour_participation_rate.min()
print("State with highest labour participation rate:", state_with_highest_labour_participation_rate)
print("Highest labour participation rate:", highest_labour_participation_rate)
print("State with lowest labour participation rate:", state_with_lowest_labour_participation_rate)
print("Lowest labour participation rate:", lowest_labour_participation_rate)
State with highest labour participation rate: Tripura Highest labour participation rate: 61.82392857142857 State with lowest labour participation rate: Uttarakhand Lowest labour participation rate: 33.775555555555556
InΒ [45]:
sns.set_palette("viridis")
plt.figure(figsize=(12, 6))
average_labour_participation_rate.sort_values(ascending=False).plot(kind='bar')
plt.title("Average labour participation Rate by State")
plt.xlabel("Region")
plt.ylabel("Average labour participation Rate (%)")
plt.xticks(rotation=90)
plt.show()
InΒ [46]:
fig = px.sunburst(df, path=['Area', 'Region'], values='Estimated Unemployment Rate (%)', title='Sunburst Plot of Estimated Unemployment Rate by Region and Area')
fig.show()
InΒ [47]:
correlation = df[['Estimated Unemployment Rate (%)', 'Estimated Employed', 'Estimated Labour Participation Rate (%)']].corr()
plt.figure(figsize=(10,8))
sns.heatmap(correlation, cmap='coolwarm')
for i in range(correlation.shape[0]):
for j in range(correlation.shape[1]):
plt.text(j+0.5, i+0.5, "{:.2f}".format(correlation.iloc[i, j]), ha='center', va='center', color='white')
plt.title('Correlation Heatmap')
plt.show()
Conclusion:ΒΆ
- The unemployment rate in rural areas exceeded that in urban areas from May 2019 to May 2020.
- However, by June 2020, the unemployment rates in both urban and rural areas had nearly equalized.
- Notably, in April and May 2020, the unemployment rate surged significantly, coinciding with the onset of the COVID-19 pandemic's economic impact.
- The rural areas have a higher employed population compared to urban areas.
- However, there was a sudden decrease in the employed population during April and May 2020.
- The labor participation rate is higher in rural areas than in urban areas.
- However, there was a sudden decrease in the labor participation rate in April 2020.
- The state with highest unemployment rate is Tripura, while the state with lowest unemployment rate is Meghalaya.
- The state with highest employment rate is Uttar Pradesh, while the state with lowest employment rate is Sikkim.
- The state with highest labour participation rate is Tripura, while the state with lowest labour participation rate is uttarakhand.
- Most of the people are employed at January 31,2020
- From the given data set, the labour participation rate was high in October and November 2019.